import pandas as pd
import numpy as np


def preprocess_airbnb_data(file_path, *, max_price=1000, max_minimum_nights=30):
    """Load and preprocess the AB_NYC_2019 dataset.

    The CSV must contain the columns ``price``, ``latitude``, ``longitude``
    and ``minimum_nights``. Processing steps, in order:

    1. Strip ``$`` and ``,`` from string prices and convert ``price`` to float.
    2. Drop rows whose ``latitude`` or ``longitude`` is missing.
    3. Keep only rows with ``0 < price < max_price`` and
       ``minimum_nights < max_minimum_nights``.
    4. Add a categorical ``price_category`` column.

    Args:
        file_path: Passed straight to ``pandas.read_csv`` (a path or a
            file-like object).
        max_price: Exclusive upper bound on ``price``. Defaults to 1000.
        max_minimum_nights: Exclusive upper bound on ``minimum_nights``.
            Defaults to 30.

    Returns:
        A new ``pandas.DataFrame`` holding the filtered rows plus the
        ``price_category`` column. The original row index labels are kept
        (the index is not reset).

    Side effects:
        Prints the number of remaining records to stdout.
    """
    listings = pd.read_csv(file_path)

    # Normalize prices such as "$1,200" to 1200.0. The pattern is a raw string
    # because "\$" inside a normal string literal is an invalid escape sequence
    # (a SyntaxWarning on recent Python versions).
    listings['price'] = (
        listings['price'].replace(r'[\$,]', '', regex=True).astype(float)
    )

    # Rows without coordinates are discarded.
    listings = listings.dropna(subset=['latitude', 'longitude'])

    # Filter out outliers: non-positive or very high prices, long minimum stays.
    keep = (
        (listings['price'] > 0)
        & (listings['price'] < max_price)
        & (listings['minimum_nights'] < max_minimum_nights)
    )
    # Take an explicit copy so that adding a column below writes to an
    # independent frame instead of triggering SettingWithCopyWarning.
    listings = listings[keep].copy()

    # Bucket prices into labelled ranges. pd.cut uses right-closed intervals by
    # default, i.e. (0, 100], (100, 200], (200, 300], (300, inf), so a price of
    # exactly 100 is labelled '<$100' and exactly 300 is labelled '$200-300'.
    listings['price_category'] = pd.cut(
        listings['price'],
        bins=[0, 100, 200, 300, np.inf],
        labels=['<$100', '$100-200', '$200-300', '$300+'],
    )

    print(f"处理后的数据集包含 {len(listings)} 条记录")
    return listings